{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 278,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "import nvstrings\n",
    "import rmm\n",
    "import numpy as np\n",
    "from collections import OrderedDict\n",
    "import time\n",
    "import os\n",
    "\n",
    "# Nobody likes redefining names and types, when most are identical\n",
    "def get_dtypes(fn, delim, floats):\n",
    "    with open(fn, errors='replace') as fp:\n",
    "        header = fp.readline().strip()\n",
    "    types = []\n",
    "    for col in header.split(delim):\n",
    "        if 'date' in col: types.append((col, 'date'))\n",
    "        elif col in floats: types.append((col, 'float64'))\n",
    "        else: types.append((col, 'int64'))\n",
    "    return OrderedDict(types)\n",
    "\n",
    "def get_df(fn, dtypes):\n",
    "    t0 = time.time()\n",
    "    size = os.path.getsize(fn)\n",
    "    df = cudf.io.read_csv(fn, names=list(dtypes), dtype=list(dtypes.values()), skiprows=1)\n",
    "    t1 = time.time()\n",
    "    rows = len(df)\n",
    "    print(str(size) + ' bytes in ' + str(t1-t0) + 's: ' + str(size/(t1-t0)/1000/1000) + ' mbytes/sec, ' + str(rows) + ' rows')\n",
    "    return df\n",
    "\n",
    "\n",
    "def get_df_and_strings(fn, dtypes):\n",
    "    t0 = time.time()\n",
    "    size = os.path.getsize(fn)\n",
    "    columns = cudf.io.read_csv_strings(fn, names=list(dtypes), dtype=list(dtypes.values()), skiprows=1)\n",
    "    df = cudf.dataframe.DataFrame()\n",
    "    strs = []\n",
    "    for idx,col in enumerate(columns):\n",
    "        name = list(dtypes.keys())[idx]\n",
    "        if type(columns[idx]) == cudf.series.Series:\n",
    "            df[name] = col\n",
    "        else:\n",
    "            strs.append(col)\n",
    "    t1 = time.time()\n",
    "    rows = len(df)\n",
    "    print(str(size) + ' bytes in ' + str(t1-t0) + 's: ' + str(size/(t1-t0)/1000/1000) + ' mbytes/sec, ' + str(rows) + ' rows')\n",
    "    return (df, strs)\n",
    "\n",
    "def hash_to_series(string):\n",
    "    #print(string)\n",
    "    word_array = rmm.device_array(string.size(), dtype=np.int32)\n",
    "    string.hash(word_array.device_ctypes_pointer.value)\n",
    "    return cudf.Series(word_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 305,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "77 bytes in 0.003477811813354492s: 0.022140358401316243 mbytes/sec, 4 rows\n"
     ]
    }
   ],
   "source": [
    "test_fn = 'string-test.csv'\n",
    "\n",
    "lines = ['number,text', '0,great and good', '1,weakest bad', '2,successfully superior', '3,winning']\n",
    "# must end with '\\n'\n",
    "with open(test_fn, 'w') as fp:\n",
    "    fp.write('\\n'.join(lines)+'\\n')\n",
    "    \n",
    "dtypes = get_dtypes(test_fn, ',', floats=[])\n",
    "dtypes['text'] = 'str'\n",
    "cols = get_df_and_strings(test_fn, dtypes)\n",
    "\n",
    "test_df = cols[0]\n",
    "words = cols[1][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 281,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df['score'] = np.zeros(len(test_df))\n",
    "test_df['ones'] = np.ones(len(test_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 282,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  number score ones\n",
      "0      0   1.0  1.0\n",
      "1      1   1.0  1.0\n",
      "2      2   1.0  1.0\n",
      "3      3   1.0  1.0\n",
      "  number score ones\n",
      "0      0   2.0  1.0\n",
      "1      1   2.0  1.0\n",
      "2      2   2.0  1.0\n",
      "3      3   2.0  1.0\n",
      "  number score ones\n",
      "0      0   3.0  1.0\n",
      "1      1   3.0  1.0\n",
      "2      2   3.0  1.0\n",
      "3      3   3.0  1.0\n"
     ]
    }
   ],
   "source": [
    "for x in range(0, 3):\n",
    "    test_df['score'] = test_df['score'] + test_df['ones']\n",
    "    print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 306,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9169185 bytes in 0.027340412139892578s: 335.3711331447408 mbytes/sec, 86551 rows\n"
     ]
    }
   ],
   "source": [
    "# https://sraf.nd.edu/textual-analysis/resources/\n",
    "dict_fn = '/raid/stack_overflow/dict.csv'\n",
    "\n",
    "dtypes = get_dtypes(dict_fn, ',', ['Word Proportion', 'Average Proportion', 'Std Dev'])\n",
    "for col in ['Word', 'Source']:\n",
    "    dtypes[col] = 'str'\n",
    "\n",
    "res = get_df_and_strings(dict_fn, dtypes)\n",
    "tmp_df = res[0]\n",
    "sentiment_words = res[1][0]\n",
    "\n",
    "sentiment_df = cudf.dataframe.DataFrame()\n",
    "sentiment_df['word_hash'] = hash_to_series(sentiment_words.lower())\n",
    "sentiment_df['pos'] = tmp_df['Positive']\n",
    "sentiment_df['neg'] = tmp_df['Negative']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_csv(dict_fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Word</th>\n",
       "      <th>Sequence Number</th>\n",
       "      <th>Word Count</th>\n",
       "      <th>Word Proportion</th>\n",
       "      <th>Average Proportion</th>\n",
       "      <th>Std Dev</th>\n",
       "      <th>Doc Count</th>\n",
       "      <th>Negative</th>\n",
       "      <th>Positive</th>\n",
       "      <th>Uncertainty</th>\n",
       "      <th>Litigious</th>\n",
       "      <th>Constraining</th>\n",
       "      <th>Superfluous</th>\n",
       "      <th>Interesting</th>\n",
       "      <th>Modal</th>\n",
       "      <th>Irr_Verb</th>\n",
       "      <th>Harvard_IV</th>\n",
       "      <th>Syllables</th>\n",
       "      <th>Source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>84102</th>\n",
       "      <td>WINNING</td>\n",
       "      <td>84103</td>\n",
       "      <td>45485</td>\n",
       "      <td>0.000003</td>\n",
       "      <td>0.000002</td>\n",
       "      <td>0.000027</td>\n",
       "      <td>23755</td>\n",
       "      <td>0</td>\n",
       "      <td>2009</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>12of12inf</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Word  Sequence Number  Word Count  Word Proportion  \\\n",
       "84102  WINNING            84103       45485         0.000003   \n",
       "\n",
       "       Average Proportion   Std Dev  Doc Count  Negative  Positive  \\\n",
       "84102            0.000002  0.000027      23755         0      2009   \n",
       "\n",
       "       Uncertainty  Litigious  Constraining  Superfluous  Interesting  Modal  \\\n",
       "84102            0          0             0            0            0      0   \n",
       "\n",
       "       Irr_Verb  Harvard_IV  Syllables     Source  \n",
       "84102         0           0          2  12of12inf  "
      ]
     },
     "execution_count": 256,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.query('Word == \"WINNING\"')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 297,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[98619021, 1223169806, 3649403303, 1349785232]\n"
     ]
    }
   ],
   "source": [
    "print(words.lower().split_column(' ')[0].hash())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 307,
   "metadata": {},
   "outputs": [],
   "source": [
    "#word_hash = sentiment_df['word_hash']\n",
    "#sentiment_df.drop_column('word_hash')\n",
    "\n",
    "test_df['score'] = np.zeros(len(test_df))\n",
    "\n",
    "word_hash = sentiment_df['word_hash']\n",
    "sentiment_df.drop_column('word_hash')\n",
    "\n",
    "idx = 0\n",
    "col = words.split_column(' ')[0]\n",
    "\n",
    "col_name = 'hash_' + str(idx)\n",
    "test_df[col_name] = hash_to_series(col.lower())\n",
    "sentiment_df[col_name] = word_hash\n",
    "test_df = test_df.merge(sentiment_df, how='left', on=[col_name], lsuffix='', rsuffix='_r')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 308,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      hash_0 number score  pos  neg\n",
      "0   98619021      0   0.0 2009    0\n",
      "1 1223169806      1   0.0    0 2009\n",
      "2 -645563993      2   0.0 2009    0\n",
      "3 1349785232      3   0.0 2009    0\n"
     ]
    }
   ],
   "source": [
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 309,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "column 'hash_0_r' does not exist",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-309-89e46bf18883>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mtest_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'neg'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mtest_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'hash_'\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'_r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      8\u001b[0m \u001b[0msentiment_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/cudf-0+unknown-py3.5.egg/cudf/dataframe.py\u001b[0m in \u001b[0;36mdrop_column\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m    440\u001b[0m         \"\"\"\n\u001b[1;32m    441\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cols\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 442\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mNameError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'column {!r} does not exist'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    443\u001b[0m         \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cols\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    444\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: column 'hash_0_r' does not exist"
     ]
    }
   ],
   "source": [
    "test_df['score'] = test_df['score'] + test_df['pos'] - test_df['neg']\n",
    "test_df['pos_'+str(idx)] = test_df['pos']\n",
    "test_df.drop_column('pos')\n",
    "test_df['neg_'+str(idx)] = test_df['neg']\n",
    "test_df.drop_column('neg')\n",
    "    \n",
    "#test_df.drop_column('hash_'+str(idx)+'_r')\n",
    "sentiment_df.drop_column(col_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 310,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      hash_0 number   score pos_0 neg_0\n",
      "0   98619021      0  2009.0  2009     0\n",
      "1 1223169806      1 -2009.0     0  2009\n",
      "2 -645563993      2  2009.0  2009     0\n",
      "3 1349785232      3  2009.0  2009     0\n"
     ]
    }
   ],
   "source": [
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[98619021, 1223169806, 3649403303, 1349785232]\n"
     ]
    }
   ],
   "source": [
    "print(columns[0].hash())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 277,
   "metadata": {},
   "outputs": [
    {
     "ename": "AssertionError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-277-475edb8705ca>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msentiment_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"word_hash == 98619021\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/cudf-0+unknown-py3.5.egg/cudf/dataframe.py\u001b[0m in \u001b[0;36mquery\u001b[0;34m(self, expr)\u001b[0m\n\u001b[1;32m   1041\u001b[0m         \u001b[0mnewdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1042\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mcol\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1043\u001b[0;31m             \u001b[0mnewseries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselected\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1044\u001b[0m             \u001b[0mnewdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnewseries\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1045\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mnewdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/cudf-0+unknown-py3.5.egg/cudf/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m    182\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    183\u001b[0m             selvals, selinds = columnops.column_select_by_boolmask(\n\u001b[0;32m--> 184\u001b[0;31m                 self._column, arg)\n\u001b[0m\u001b[1;32m    185\u001b[0m             \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselinds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_gpu_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    186\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_copy_construct\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselvals\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/cudf-0+unknown-py3.5.egg/cudf/columnops.py\u001b[0m in \u001b[0;36mcolumn_select_by_boolmask\u001b[0;34m(column, boolmask)\u001b[0m\n\u001b[1;32m    102\u001b[0m     \"\"\"\n\u001b[1;32m    103\u001b[0m     \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mnumerical\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mNumericalColumn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 104\u001b[0;31m     \u001b[0;32massert\u001b[0m \u001b[0mcolumn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnull_count\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m  \u001b[0;31m# We don't properly handle the boolmask yet\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    105\u001b[0m     \u001b[0mboolbits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcudautils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompact_mask_bytes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mboolmask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_gpu_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    106\u001b[0m     \u001b[0mindices\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcudautils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mboolmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAssertionError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "sentiment_df.query(\"word_hash == 98619021\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_hash = sentiment_df['word_hash']\n",
    "sentiment_df.drop_column('word_hash')\n",
    "\n",
    "test_df['score'] = np.zeros(len(test_df))\n",
    "\n",
    "columns = words.split_column(' ')\n",
    "\n",
    "for idx, col in enumerate(columns):\n",
    "    col_name = 'hash_' + str(idx)\n",
    "    test_df[col_name] = hash_to_series(col.lower())\n",
    "    # cuDF joins operate on columns of the same name\n",
    "    sentiment_df[col_name] = word_hash\n",
    "    test_df = test_df.merge(sentiment_df, how='left', on=[col_name], lsuffix='', rsuffix='_r')\n",
    "    \n",
    "    test_df['score'] = test_df['score'] + test_df['pos'] - test_df['neg']\n",
    "    test_df['pos_'+str(idx)] = test_df['pos']\n",
    "    test_df.drop_column('pos')\n",
    "    test_df['neg_'+str(idx)] = test_df['neg']\n",
    "    test_df.drop_column('neg')\n",
    "    \n",
    "    sentiment_df.drop_column(col_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  number sentiment_score    hash_0l pos_0 neg_0    hash_1l pos_1 ... neg_2\n",
      "0      0             0.0   98619021     0     0      96727     0 ...     0\n",
      "1      1             0.0 -601795244     0     0      97285     0 ...     0\n",
      "2      2             0.0 1909191136     0     0 -602775453     0 ...     0\n",
      "3      3             0.0 1640548322     0     0          0     0 ...     0\n",
      "[3 more columns]\n"
     ]
    }
   ],
   "source": [
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "43399004 bytes in 0.5978922843933105s: 72.58666006041132 mbytes/sec, 358873 rows\n"
     ]
    }
   ],
   "source": [
    "# https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv\n",
    "beers_fn = '/raid/beer/beers.csv'\n",
    "dtypes = get_dtypes(beers_fn, ',',['abv'])\n",
    "for col in ['name', 'state', 'country', 'style', 'availability', 'notes']:\n",
    "    dtypes[col] = 'str'\n",
    "res = get_df_and_strings(beers_fn, dtypes)\n",
    "beers_df = res[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 315,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<cudf.Series nrows=116072 >"
      ]
     },
     "execution_count": 315,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "columns = res[1][0].split_column(' ')\n",
    "\n",
    "def create_value_counts(split_nvstring_object):    \n",
    "    # Calculate value_counts\n",
    "    temp = list(map(lambda x: cudf.Series(x.hash()), split_nvstring_object))\n",
    "    val_counts = cudf.multi.concat(temp).value_counts()\n",
    "    return val_counts\n",
    "\n",
    "create_value_counts(columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4184324 bytes in 0.027175188064575195s: 153.9758985312991 mbytes/sec, 50347 rows\n"
     ]
    }
   ],
   "source": [
    "breweries_fn = '/raid/beer/breweries.csv'\n",
    "for col in ['name', 'city', 'state', 'country', 'notes', 'types']: dtypes[col] = 'str'\n",
    "res = get_df_and_strings(breweries_fn, dtypes)\n",
    "breweries_df = res[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2318668689 bytes in 3.4482719898223877s: 672.4146748990729 mbytes/sec, 9073128 rows\n"
     ]
    }
   ],
   "source": [
    "beer_reviews_fn = '/raid/beer/reviews.csv'\n",
    "floats = ['look', 'smell', 'feel', 'taste', 'feel', 'overall', 'score']\n",
    "dtypes = get_dtypes(beer_reviews_fn, ',', floats)\n",
    "for col in ['username', 'text']: dtypes[col] = 'str'\n",
    "res = get_df_and_strings(beer_reviews_fn, dtypes)\n",
    "beer_reviews_df = res[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['\"\\xa0\\xa0 750 ml bottle', '\\xa0\\xa0']\n"
     ]
    }
   ],
   "source": [
    "print(res[1][1].sublist([0, 1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  beer_id                    date                  look                 smell                 taste                  feel                overall                 score\n",
      "0  271781 2017-03-17T00:00:00.000        201516393503.0 5.705451273030825e+18 7.411958951106261e+92 7.411972001369666e+39               707503.0          5728795633.0\n",
      "1  125646 2017-12-21T00:00:00.000                   4.5                   4.5                   4.5                   4.5                    4.5                   4.5\n",
      "2  125646 2017-12-21T00:00:00.000                  4.75                  4.75                  4.75                  4.75                   4.75                  4.75\n",
      "3  125646 2017-12-20T00:00:00.000 5.669973100634188e+71 7.503567215803741e+18              742269.0 5.560472431977581e+16 6.6966909571065816e+16 6.215391986342381e+17\n",
      "4  125646 2017-12-20T00:00:00.000                  4.25                   4.5                  4.25                  4.25                   4.25                  4.31\n"
     ]
    }
   ],
   "source": [
    "print(beer_reviews_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "words = res[1][1].lower().split_column(' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1799936480 bytes in 2.383992910385132s: 755.009158021876 mbytes/sec, 16587830 rows\n"
     ]
    }
   ],
   "source": [
    "# https://www.kaggle.com/rtatman/ubuntu-dialogue-corpus\n",
    "dialogue_fn = '/raid/ubuntu/Ubuntu-dialogue-corpus/dialogueText_301.csv'\n",
    "\n",
    "dtypes = get_dtypes(dialogue_fn, ',', floats=[])\n",
    "#src user, dst user, message\n",
    "for col in ['from', 'to', 'text']: dtypes[col] = 'str'\n",
    "\n",
    "res = get_df_and_strings(dialogue_fn, dtypes)\n",
    "\n",
    "dialogue_df = res[0]\n",
    "text_strings = res[1][2]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
